Candace Savonen - CCDL for ALSF
This notebook is a first pass in evaluating the concordance between MuTect2 and Strelka2 results.
It addresses issue # 30 in OpenPBTA.
* Some of these set up steps will be removed once we have a Dockerfile that installs maftools automatically.
if (!('colorblindr' %in% installed.packages())) {
devtools::install_github("clauswilke/colorblindr")
}
Downloading GitHub repo clauswilke/colorblindr@master
Installing 6 packages: cowplot, shiny, httpuv, sourcetools, later, promises
cannot open URL 'https://cran.rstudio.com/bin/macosx/el-capitan/contrib/3.6/PACKAGES.rds': HTTP status was '404 Not Found'trying URL 'https://cran.rstudio.com/bin/macosx/el-capitan/contrib/3.6/cowplot_1.0.0.tgz'
Content type 'application/x-gzip' length 1362732 bytes (1.3 MB)
==================================================
downloaded 1.3 MB
trying URL 'https://cran.rstudio.com/bin/macosx/el-capitan/contrib/3.6/shiny_1.3.2.tgz'
Content type 'application/x-gzip' length 4655011 bytes (4.4 MB)
==================================================
downloaded 4.4 MB
trying URL 'https://cran.rstudio.com/bin/macosx/el-capitan/contrib/3.6/httpuv_1.5.1.tgz'
Content type 'application/x-gzip' length 2936016 bytes (2.8 MB)
==================================================
downloaded 2.8 MB
trying URL 'https://cran.rstudio.com/bin/macosx/el-capitan/contrib/3.6/sourcetools_0.1.7.tgz'
Content type 'application/x-gzip' length 138829 bytes (135 KB)
==================================================
downloaded 135 KB
trying URL 'https://cran.rstudio.com/bin/macosx/el-capitan/contrib/3.6/later_0.8.0.tgz'
Content type 'application/x-gzip' length 385957 bytes (376 KB)
==================================================
downloaded 376 KB
trying URL 'https://cran.rstudio.com/bin/macosx/el-capitan/contrib/3.6/promises_1.0.1.tgz'
Content type 'application/x-gzip' length 307177 bytes (299 KB)
==================================================
downloaded 299 KB
The downloaded binary packages are in
/var/folders/1c/kxsx50393_g8qkhd83sktgm80000gn/T//RtmpQ451K5/downloaded_packages
checking for file ‘/private/var/folders/1c/kxsx50393_g8qkhd83sktgm80000gn/T/RtmpQ451K5/remotesda2b47fcecea/clauswilke-colorblindr-1ac3d4d/DESCRIPTION’ ...
[32m✔[39m [38;5;247mchecking for file ‘/private/var/folders/1c/kxsx50393_g8qkhd83sktgm80000gn/T/RtmpQ451K5/remotesda2b47fcecea/clauswilke-colorblindr-1ac3d4d/DESCRIPTION’[39m[36m[39m
[38;5;247m─[39m[38;5;247m [39m[38;5;247mpreparing ‘colorblindr’:[39m[36m[39m
checking DESCRIPTION meta-information ...
[32m✔[39m [38;5;247mchecking DESCRIPTION meta-information[39m[36m[39m
[38;5;247m─[39m[38;5;247m [39m[38;5;247mchecking for LF line-endings in source and make files and shell scripts[39m[36m[39m
[38;5;247m─[39m[38;5;247m [39m[38;5;247mchecking for empty or unneeded directories[39m[36m[39m
[38;5;247m─[39m[38;5;247m [39m[38;5;247mbuilding ‘colorblindr_0.1.0.tar.gz’[39m[36m[39m
File /Users/candacesavonen/.Renviron contains invalid line(s)
GITHUB_PATbb0bf7f0c887a3d0e5b68ddb8749bb30986ca69f
They were ignored
* installing *source* package ‘colorblindr’ ...
** using staged installation
** R
** inst
** byte-compile and prepare package for lazy loading
File /Users/candacesavonen/.Renviron contains invalid line(s)
GITHUB_PATbb0bf7f0c887a3d0e5b68ddb8749bb30986ca69f
They were ignored
** help
*** installing help indices
*** copying figures
** building package indices
** installing vignettes
** testing if installed package can be loaded from temporary location
File /Users/candacesavonen/.Renviron contains invalid line(s)
GITHUB_PATbb0bf7f0c887a3d0e5b68ddb8749bb30986ca69f
They were ignored
** testing if installed package can be loaded from final location
File /Users/candacesavonen/.Renviron contains invalid line(s)
GITHUB_PATbb0bf7f0c887a3d0e5b68ddb8749bb30986ca69f
They were ignored
** testing if installed package keeps a record of temporary installation path
* DONE (colorblindr)
Create a output directories.
if (!dir.exists("results")) {
dir.create("results")
}
if (!dir.exists("plots")) {
dir.create("plots")
}
This will be changed once the data download script is finalized.
Prep the metadata to be used as the clinicalData for maftools it it hasn’t been prepped yet. This whole chunk may need to be removed be taken out after issue 2 regarding the data download script is addressed.
These steps are only to make the metadata and MuTect2 and Strelka2 datasets parallel in what samples they contain.
There are some samples in the metadata that are not in the MuTect2 and Strelka2 data.
if (file.exists(file.path("..", "..", "scratch", "metadata_filtered.tsv"))) {
metadata <- readr::read_tsv(file.path("..", "..", "scratch", "metadata_filtered.tsv"))
}
Parsed with column specification:
cols(
sample_id = [31mcol_character()[39m,
patient_id = [31mcol_character()[39m,
sample_type = [31mcol_character()[39m,
Composition = [31mcol_character()[39m,
disease_type = [31mcol_character()[39m,
`Tumor Descriptor` = [31mcol_character()[39m,
primary_site = [31mcol_character()[39m,
gender = [31mcol_character()[39m,
race = [31mcol_character()[39m,
ethnicity = [31mcol_character()[39m,
age_at_diagnosis = [32mcol_double()[39m
)
Will read in as an maftools object from an RDS file, unless maftools has not been run on them yet. Running maftools takes a lot of computing power and time, so be prepared. If you trying to run this step in a Docker container, it will likely be out of memory killed, unless you have ~50GB you can allot to Docker.
# Load in the RDS file if it exists, but if it doesn't exist, load in from
# original data file with maftools
if (!file.exists(file.path("..", "..", "scratch", "strelka2.RDS"))) {
strelka <- maftools::read.maf(file.path("..", "..", "data", "strelka2.maf.gz"),
clinicalData = metadata)
saveRDS(strelka, file.path("..", "..", "scratch", "strelka2.RDS"))
} else {
strelka2 <- readRDS(file.path("..", "..", "scratch", "strelka2.RDS"))
}
This is how I set up metadata_filtered.tsv. This should not need to be ran again and won’t run again unless metadata_filtered.tsv is misplaced from the scratch file.
if (!file.exists(file.path("..", "..", "scratch", "metadata_filtered.tsv"))) {
# Isolate metadata to only the samples that are in the datasets
metadata <- metadata %>%
dplyr::filter(sample_id %in% mutect2@clinical.data$Tumor_Sample_Barcode) %>%
dplyr::distinct(sample_id, .keep_all = TRUE) %>%
readr::write_tsv(file.path("..", "..", "scratch", "metadata_filtered.tsv"))
}
Check that samples are same order in the datasets as they are in the metadata
all.equal(as.factor(metadata$sample_id),
strelka2@clinical.data$Tumor_Sample_Barcode)
[1] TRUE
all.equal(as.factor(metadata$sample_id),
mutect2@clinical.data$Tumor_Sample_Barcode)
[1] TRUE
Get gene summaries and write to TSV files.
strelka2_gene_sum <- maftools::getGeneSummary(strelka2) %>%
readr::write_tsv(file.path("results",
"strelka2_gene_summary.tsv"))
mutect2_gene_sum <- maftools::getGeneSummary(mutect2) %>%
readr::write_tsv(file.path("results",
"mutect2_gene_summary.tsv"))
Get sample summaries and write to TSV files.
strelka2_sample_sum <- maftools::getSampleSummary(strelka2) %>%
readr::write_tsv(file.path("results",
"strelka2_sample_summary.tsv"))
mutect2_sample_sum <- maftools::getSampleSummary(mutect2) %>%
readr::write_tsv(file.path("results",
"mutect2_sample_summary.tsv"))
combined_gene <- mutect2_gene_sum %>%
dplyr::full_join(strelka2_gene_sum, by = 'Hugo_Symbol') %>%
reshape2::melt(id = 'Hugo_Symbol') %>%
dplyr::mutate(dataset = as.character(grepl(".x$", variable))) %>%
dplyr::mutate(dataset = dplyr::recode(dataset,
`TRUE` = "mutect2",
`FALSE` = "strelka2")) %>%
dplyr::mutate(variable = gsub(".x$|.y$", "", variable)) %>%
tidyr::spread('dataset', 'value')
Make number of mutations per gene scatterplots.
gene_cor <- ggplot2::ggplot(combined_gene, ggplot2::aes(x = mutect2, y = strelka2)) +
ggplot2::geom_hex(bins = 10) +
ggplot2::facet_wrap(~variable, scales = "free") +
ggplot2::geom_smooth(method = lm) +
ggplot2::theme_classic() +
ggplot2::xlab("Mutect2: Number of mutations per gene") +
ggplot2::ylab("Strelka2: Number of mutations per gene")
# Print out the plot in this notebook
gene_cor
Save the plot to a PDF.
ggplot2::ggsave(file.path("plots", "gene_cor_mutect2_vs_strelka2.pdf"))
Saving 7 x 7 in image
Let’s get a correlation test on the genes overall.
cor.test(combined_gene$mutect2, combined_gene$strelka2, method = "spearman")
Cannot compute exact p-value with ties
Spearman's rank correlation rho
data: combined_gene$mutect2 and combined_gene$strelka2
S = 4.2762e+13, p-value < 2.2e-16
alternative hypothesis: true rho is not equal to 0
sample estimates:
rho
0.9604808
cor.test(combined_gene$mutect2, combined_gene$strelka2, method = "pearson")
Pearson's product-moment correlation
data: combined_gene$mutect2 and combined_gene$strelka2
t = 580.42, df = 186550, p-value < 2.2e-16
alternative hypothesis: true correlation is not equal to 0
95 percent confidence interval:
0.8006256 0.8038602
sample estimates:
cor
0.8022488
combined_sample <- mutect2_sample_sum %>%
dplyr::full_join(strelka2_sample_sum, by = 'Tumor_Sample_Barcode') %>%
reshape2::melt(id = 'Tumor_Sample_Barcode') %>%
dplyr::mutate(dataset = as.character(grepl(".x$", variable))) %>%
dplyr::mutate(dataset = dplyr::recode(dataset,
`TRUE` = "mutect2",
`FALSE` = "strelka2")) %>%
dplyr::mutate(variable = gsub(".x$|.y$", "", variable)) %>%
tidyr::spread('dataset', 'value')
Make number of mutations per sample scatterplots.
sample_cor <- ggplot2::ggplot(combined_sample, ggplot2::aes(x = mutect2, y = strelka2)) +
ggplot2::geom_hex(bins = 10) +
ggplot2::facet_wrap(~variable, scales = "free") +
ggplot2::geom_smooth(method = lm) +
ggplot2::theme_classic() +
ggplot2::xlab("Mutect2: Number of mutations per sample") +
ggplot2::ylab("Strelka2: Number of mutations per sample")
# Print out the plot in this notebook
sample_cor
Save the plot to a PDF.
ggplot2::ggsave(file.path("plots", "sample_cor_mutect2_vs_strelka2.pdf"))
Saving 7 x 7 in image
Let’s get a correlation test on the genes overall. Question 2) Is this a reasonable amount of concordance for these methods across samples?
cor.test(combined_sample$mutect2, combined_sample$strelka2, method = "spearman")
Cannot compute exact p-value with ties
Spearman's rank correlation rho
data: combined_sample$mutect2 and combined_sample$strelka2
S = 2.8267e+10, p-value < 2.2e-16
alternative hypothesis: true rho is not equal to 0
sample estimates:
rho
0.7749395
cor.test(combined_sample$mutect2, combined_sample$strelka2, method = "pearson")
Pearson's product-moment correlation
data: combined_sample$mutect2 and combined_sample$strelka2
t = 720.76, df = 9098, p-value < 2.2e-16
alternative hypothesis: true correlation is not equal to 0
95 percent confidence interval:
0.9909958 0.9917034
sample estimates:
cor
0.9913568
Set up data.frame with mutation and VAF for Strelka2.
strelka2_vaf <- strelka2@data %>%
dplyr::filter(!grepl("-", Allele)) %>%
dplyr::mutate(vaf = as.numeric(t_alt_count)/(as.numeric(t_ref_count) +
as.numeric(t_alt_count)),
mutation = paste0(Hugo_Symbol, "_",
Allele, "_",
Tumor_Sample_Barcode, "_",
Start_Position),
change = paste0(Reference_Allele, ">", Allele),
coding = dplyr::case_when(
BIOTYPE != "protein_coding" ~ "non-coding",
TRUE ~ "protein_coding")) %>%
dplyr::select(-which(apply(is.na(.), 2, all)))
NAs introduced by coercion
# Take a look at this df
strelka2_vaf
Set up data.frame with mutation and VAF for MuTect2.
mutect2_vaf <- mutect2@data %>%
dplyr::filter(!grepl("-", Allele)) %>%
dplyr::mutate(vaf = as.numeric(t_alt_count)/(as.numeric(t_ref_count) +
as.numeric(t_alt_count)),
mutation = paste0(Hugo_Symbol, "_",
Allele, "_",
Tumor_Sample_Barcode, "_",
Start_Position),
change = paste0(Reference_Allele, ">", Allele),
coding = dplyr::case_when(
BIOTYPE != "protein_coding" ~ "non-coding",
TRUE ~ "protein_coding")) %>%
dplyr::select(-which(apply(is.na(.), 2, all)))
# Take a look at this df
mutect2_vaf
Combine MuTect2 and Strelka2 VAF data.frames so we can compare.
# Merge these data.frames together
vaf_df <- strelka2_vaf %>%
dplyr::full_join(mutect2_vaf, by = 'mutation',
suffix = c(".strelka2", ".mutect2")) %>%
# Make a variable that denotes which dataset it is in.
dplyr::mutate(dataset = dplyr::case_when(
is.na(vaf.mutect2) ~ "strelka2_only",
is.na(vaf.strelka2) ~ "mutect2_only",
TRUE ~ "both"))
Plot this as a scatterplot
vaf_df %>%
ggplot2::ggplot(ggplot2::aes(x = vaf.strelka2, y = vaf.mutect2)) +
ggplot2::geom_hex() +
ggplot2::theme_classic() +
ggplot2::xlab("VAF for each mutation for Strelka2") +
ggplot2::ylab("VAF for each mutation for MuTect2")
vaf_df %>%
tidyr::gather(key = "data", value = "vaf" , vaf.strelka2, vaf.mutect2) %>%
dplyr::mutate(data = gsub("^vaf.", "", data)) %>%
dplyr::mutate(data.group = paste(dataset, ":", data, "VAF")) %>%
dplyr::filter(!is.na(vaf)) %>%
# Plot it
ggplot2::ggplot(ggplot2::aes(data.group, vaf)) +
ggplot2::geom_violin(fill = "light blue") +
ggplot2::theme_classic( ) +
ggplot2::ylab("Density of VAF") +
ggplot2::xlab(" ")
Make the Venn diagram of MuTect2 and Strelka2 mutations.
count <- summary(as.factor(vaf_df$dataset))
# Take a look at this summary
count
both mutect2_only strelka2_only
62181 23121 35012
# Make the Venn diagram
grid::grid.newpage();
venn.plot <- VennDiagram::draw.pairwise.venn(
area1 = count[3] + count[1],
area2 = count[2] + count[1],
cross.area = count[1],
category = c("Strelka2", "MuTect2"),
fill = c("blue", "yellow"),
cex = 2,
cat.cex = 1.5,
cat.dist = c(-0.04, -0.031),
ext.pos = 0,
ext.dist = -0.01,
ext.length = .8,
ext.line.lwd = 2,
ext.line.lty = "dashed");
grid::grid.draw(venn.plot) # Draw plot
Save the Venn Diagram plot to a PDF.
# Make filename to save plot as
venn.plot.file <- file.path("plots",
"strelka2_mutect2_venn_diagram.pdf")
pdf(venn.plot.file);
grid::grid.draw(venn.plot);
dev.off()
null device
1
Let’s make a wrapper function that will do this for whatever variables we are interested in.
barplot_var(vaf_df, "change", 100)
barplot_var(vaf_df, "coding", 10)
barplot_var("Variant_Classification", 10)
Session Info:
sessionInfo()